CHAPTER 1 Data Visualization with ggplot2

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

First Steps

Let’s use our first graph to answer a question: do cars with big engines use more fuel than cars with small engines? You probably already have an answer, but try to make your answer precise. What does the relationship between engine size and fuel efficiency look like? Is it positive? Negative? Linear? Nonlinear?

head(mpg)
## # A tibble: 6 × 11
##   manufacturer model displ  year   cyl trans      drv     cty   hwy fl    class 
##   <chr>        <chr> <dbl> <int> <int> <chr>      <chr> <int> <int> <chr> <chr> 
## 1 audi         a4      1.8  1999     4 auto(l5)   f        18    29 p     compa…
## 2 audi         a4      1.8  1999     4 manual(m5) f        21    29 p     compa…
## 3 audi         a4      2    2008     4 manual(m6) f        20    31 p     compa…
## 4 audi         a4      2    2008     4 auto(av)   f        21    30 p     compa…
## 5 audi         a4      2.8  1999     6 auto(l5)   f        16    26 p     compa…
## 6 audi         a4      2.8  1999     6 manual(m5) f        18    26 p     compa…
ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy)) # create scatterplot

# graphing template
# ggplot(data = <DATA>) +
#   <GEOM_FUNCTION>(mapping = aes(<MAPPINGS>))

Aesthetic Mappings

# scaling: To map an aesthetic to a variable, associate the name of the aesthetic to the name of the variable inside aes(). ggplot2 will automatically assign a unique level of the aesthetic (here a unique color, size) to each unique value of the variable
ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy, color = class)) #color

# geom_point(mapping = aes(x = displ, y = hwy, size = class))  #size
# geom_point(mapping = aes(x = displ, y = hwy, alpha = class)) #alpha 透明度
# geom_point(mapping = aes(x = displ, y = hwy, shape = class)) #shape, only 6 types of shape

# set the aesthetic properties
ggplot(data = mpg) +
      geom_point(mapping = aes(x = displ, y = hwy), color = "blue")

Facets

ggplot( data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy))+
  facet_wrap(~ class, nrow = 2)

#nrow:此选项指定了分面布局中的行数。例如,nrow = 2意味着分面布局将有两行面板。
#ncol:此选项指定了分面布局中的列数。例如,ncol = 3意味着分面布局将有三列面板。
#scales:定义是否为每个面板使用相同的坐标轴尺度或为每个面板使用不同的尺度。
#shrink:是否缩小面板以适应输出设备。
#labeller:用于控制如何显示分面标签的函数。
#as.table:控制面板的布局顺序。
#drop:是否删除没有数据的面板。

# only 2 variables
 ggplot(data = mpg) +
      geom_point(mapping = aes(x = displ, y = hwy)) +
      facet_grid(drv ~ cyl)

 #空单元格表示drv和cyl的某些组合在数据中不存在
 
  ggplot(data = mpg) +
      geom_point(mapping = aes(x = displ, y = hwy)) +
      facet_grid(. ~ cyl)

   ggplot(data = mpg) +
      geom_point(mapping = aes(x = displ, y = hwy)) +
      facet_grid(drv ~ .)

Geometric Objects

line chart: geom_line() boxplot: geom_boxplot() histogram: geom_histogram() area chart: geom_area()

# scatterplot
ggplot(data = mpg) +
      geom_point(mapping = aes(x = displ, y = hwy))

# loess
ggplot(data = mpg) +
      geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# different linetype by group
ggplot(data = mpg) +
      geom_smooth(mapping = aes(x = displ, y = hwy, linetype = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# different lines and color by group
ggplot(data = mpg) +
      geom_smooth(mapping = aes(x = displ, y = hwy, group = drv, color = drv))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# different color by group
ggplot(data = mpg) +
  geom_smooth(
    mapping = aes(x = displ, y = hwy, color = drv),
    show.legend = T)
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# multiple geoms in same plot
ggplot(data = mpg)+
  geom_point(mapping = aes(x = displ, y = hwy))+
  geom_smooth(mapping = aes(x = displ, y = hwy))
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# prefer way for multiple geoms in same plot
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point()+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

#different aesthetic objects
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) +
  geom_point(aes(color = class))+
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

# different data
ggplot(data = mpg, mapping = aes(x = displ, y = hwy)) + 
  geom_point(mapping = aes(color = class)) + 
  geom_smooth(
    data = filter(mpg, class == "subcompact"), 
    se = FALSE) #se - stander error
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

Statistical Transformations

ggplot(data = diamonds)+
  geom_bar(mapping = aes(x=cut))

ggplot(data = diamonds) +
      stat_count(mapping = aes(x = cut))

ggplot(data = diamonds) + 
  geom_bar(mapping = aes(x = cut, 
                         y = after_stat(prop), #这表示y轴显示的是每个cut级别的比例,而不是计数
                         group = 1)) #确保所有的条形的高度之和为1(或100%)

ggplot(data = diamonds) + 
  stat_summary(
    mapping = aes(x = cut, y = depth),
    fun.min = min,
    fun.max = max,
    fun = median
  )

Position Adjustments

ggplot(data = diamonds)+
  geom_bar(mapping = aes(x = cut, color = cut)) #边框颜色

ggplot(data = diamonds) +
      geom_bar(mapping = aes(x = cut, fill = cut)) #填充颜色

#每个条形都是堆起来的 垒起来的
ggplot(data = diamonds) +
      geom_bar(mapping = aes(x = cut, fill = clarity))

#每个条形是重叠的
#position = "identity"
#条形会直接绘制在它们在数据中的实际位置上。
#如果有多个条形共享同一个位置(例如,同一个x值),它们会重叠,而不是堆叠或并排。
ggplot(
  data = diamonds,
  mapping = aes(x = cut, fill = clarity))+
  geom_bar(alpha = 1/5, position = "identity")

ggplot(
  data = diamonds,
  mapping = aes(x = cut, color = clarity))+
geom_bar(fill = NA, position = "identity")

#position = "fill"
#条形会堆叠在一起,就像使用默认的position = "stack"一样。
#但与"stack"不同的是,"fill"会对每个堆叠的条形进行归一化,使得每个堆叠的总高度都是1或100%
ggplot(data = diamonds) +
  geom_bar(
    mapping = aes(x = cut, fill = clarity),
    position = "fill")

#position = "dodge"
#这意味着条形会并排放置,而不是堆叠
ggplot(data = diamonds) +
  geom_bar(mapping = aes(x = cut, fill = clarity), position = "dodge")

#position = "jitter"
#每个点的位置会稍微随机地偏移一点,这样即使多个点有相同或非常接近的x和y值,它们也不会完全重叠
#如果您有一个数据集,其中很多点都有相同或非常接近的值,那么直接绘制散点图可能会导致很多点重叠,使得很难看到每个点。在这种情况下,使用position = "jitter"可以帮助提高图形的可读性
ggplot(data = mpg) +
  geom_point(
    mapping = aes(x = displ, y = hwy),
    position = "jitter")

#geom_jitter():它为散点图中的点添加了一定量的随机偏移,以避免点的重叠。这对于有大量重叠点的数据集特别有用。
#geom_count():它也用于处理重叠的点,但它的方法是通过改变点的大小来表示每个位置上的点的数量。换句话说,它会计算每个位置上的点的数量,并使用大小不同的点来表示这些数量
ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_jitter() #width = 0.2, height = 0.2

ggplot(data = mpg, mapping = aes(x = cty, y = hwy)) +
  geom_count()

Coordinate Systems

# coord_flip() switches the x- and y-axes
ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot()

ggplot(data = mpg, mapping = aes(x = class, y = hwy)) +
  geom_boxplot() +
  coord_flip()

nz <- map_data("nz")
ggplot(nz, aes(long, #表示经度 x轴
               lat, #表示维度 x轴
               group = group)) + #这是一个特殊的映射,用于指示哪些点应该被连接在一起或被视为一个组。在绘制地图的多边形时,它确保每个地理区域的边界被正确地绘制
  #geom_polygon()用于绘制多边形
  geom_polygon(fill = "white", color = "black")

#coord_quickmap() 
ggplot(nz, aes(long, lat, group = group)) +
  geom_polygon(fill = "white", color = "black") +
  coord_quickmap()

#coord_quickmap()。这个函数调整了纵横比,使得地图更加准确地表示新西兰的形状。这是因为地球是一个球体,所以在二维地图上直接绘制经纬度可能会导致扭曲。coord_quickmap()提供了一个快速的方法来调整这种扭曲,尤其是对于较小的地区。

#cood_polar() - polar coordinates.
#Coxcomb图或玫瑰图
bar <- ggplot(data = diamonds) +
geom_bar(mapping = aes(x = cut, fill = cut), show.legend = FALSE,width = 1)+ #确保条形之间没有间隙
theme(aspect.ratio = 1) + #确保图的宽度和高度相等
labs(x = NULL, y = NULL)
bar + coord_flip()

bar + coord_polar()

#labs()
#labs(title = "汽车的重量与每加仑英里数的关系", x = "每加仑英里数", y = "汽车重量")
#labs(title = "汽车的重量与每加仑英里数的关系", subtitle = "基于mtcars数据集", caption = "数据来源:R内置数据集")

The Layered Grammar of Graphics

#ggplot(data = <DATA>) +
#  <GEOM_FUNCTION>(
#    mapping = aes(<MAPPINGS>),
#    stat = <STAT>,
#    position = <POSITION>
#)+ 
#<COORDINATE_FUNCTION> + 
#<FACET_FUNCTION>